FaceNewsIntervention - Data Cleaning

Data Preparation

Code
library(tidyverse)
library(easystats)
library(patchwork)
library(ggside)
library(modelsummary)
df <- read.csv("../data/rawdata.csv")  |> 
  mutate(Intervention = ifelse(!is.na(Badnews_Questions_Duration), "BadNews", "Tetris")) |> 
  select(-Prolific_ID, -Date_OSF, -BadNews_Duration)

dfmist <- read.csv("../data/rawdata_mist.csv")

The initial sample consisted of 371 participants (Mean age = 49.7, SD = 101.1, range: [18, 1964], 0.3% missing; Gender: 52.3% women, 45.8% men, 1.89% non-binary; Education: Bachelor, 35.31%; Doctorate, 3.77%; High School, 36.39%; Master, 18.33%; Other, 6.20%), for a total trial number of 371.

Exclusion

Intervention Duration

Code
df |> 
  ggplot(aes(x=Intervention_Duration, fill=Intervention)) +
  geom_density(alpha=0.6) +
  theme_minimal()

Questionnaires

Political Identification

Code
df$Political_LiberalConservative <- ifelse(df$ANES_1 == 4, NA, df$ANES_1)
df$Political_LiberalConservative <- ifelse(df$Political_LiberalConservative > 3, 
                                           df$Political_LiberalConservative - 1, 
                                           df$Political_LiberalConservative)
df$Political_Affiliation <- case_when(df$ANES_2 == 1 ~ "Democrat", 
                                      df$ANES_2 == 2 ~ "Republican", 
                                      df$ANES_2 == 3 ~ "None or Independent", 
                                      .default = "Other")
 
df <- select(df, -starts_with("ANES"))

The data contains 371 observations of the following 2 variables:

  • Political_LiberalConservative: n = 371, Mean = 2.85, SD = 1.79, Median = , MAD = 2.97, range: [0, 6], Skewness = 0.03, Kurtosis = -1.07, 1.08% missing
  • Political_Affiliation: 4 entries, such as Other (34.50%); Republican (33.42%); Democrat (31.81%); None or Independent (0.27%) (0 missing)
Code
df |> 
  filter(!is.na(Political_LiberalConservative) & Political_Affiliation != "None or Independent") |> 
  ggplot(aes(x=Political_LiberalConservative, fill=Political_Affiliation)) +
  geom_density(alpha=0.6) 

Authoritarianism (VSA)

Total Right-Wing Authoritarianism score: the sum of all items divided by 6. Note:Item 1 and 2 measure Conservatism or Authoritarian Submission. Items 3 and 4 measure Traditionalism or Conventionalism. Items 5 and 6 measure Authoritarianism or Authoritarian Aggression.

Code
vsa <- select(df, starts_with("VSA_"), -VSA_Duration) 

plot(summary(correlation(vsa)))

Code
df$VSA_Conservatism <- (datawizard::reverse_scale(df$VSA_1, range = c(0, 8)) + df$VSA_2) / 2
df$VSA_Traditionalism <- (df$VSA_3 + datawizard::reverse_scale(df$VSA_4, range = c(0, 8))) / 2
df$VSA_Authoritarianism <- (datawizard::reverse_scale(df$VSA_5, range = c(0, 8)) + df$VSA_6) / 2
df$VSA_General <- rowMeans(df[, c("VSA_Conservatism", "VSA_Traditionalism", "VSA_Authoritarianism")], na.rm = TRUE)

modelsummary::datasummary_skim(select(df, starts_with("VSA_")))
tinytable_zgfykz2sv8wgozddqspg
Unique Missing Pct. Mean SD Min Median Max Histogram
VSA_Duration 371 0 2.1 11.8 0.1 0.9 223.7
VSA_1 9 0 4.1 2.5 0.0 4.0 8.0
VSA_2 9 0 3.6 2.5 0.0 3.0 8.0
VSA_3 9 0 3.2 2.9 0.0 3.0 8.0
VSA_4 9 0 5.4 2.6 0.0 6.0 8.0
VSA_5 9 0 4.3 2.4 0.0 4.0 8.0
VSA_6 9 0 4.3 2.5 0.0 5.0 8.0
VSA_Conservatism 17 0 3.8 2.1 0.0 3.5 8.0
VSA_Traditionalism 17 0 2.9 2.5 0.0 2.5 8.0
VSA_Authoritarianism 17 0 4.0 2.1 0.0 4.0 8.0
VSA_General 49 0 3.6 1.9 0.0 3.5 8.0

BFI

df$BFI_Agreeableness <- (df$BFI10_2 + (6-df$BFI10_7)) / 2
df$BFI_Extraversion <- (df$BFI10_6 + (6-df$BFI10_1)) / 2
df$BFI_Conscientiousness <- (df$BFI10_8 + (6-df$BFI10_3)) / 2
df$BFI_Neuroticism <- (df$BFI10_9 + (6-df$BFI10_4)) / 2
df$BFI_Openness <- (df$BFI10_10 + (6-df$BFI10_5)) / 2
Code
modelsummary::datasummary_skim(select(df, starts_with("BFI_")))
tinytable_crkpi4ojkicrb6ia8axl
Unique Missing Pct. Mean SD Min Median Max Histogram
BFI_Duration 371 0 1.2 2.6 0.3 0.8 46.7
BFI_Agreeableness 9 0 3.6 1.0 1.0 4.0 5.0
BFI_Extraversion 9 0 2.8 1.1 1.0 3.0 5.0
BFI_Conscientiousness 9 0 4.2 0.9 1.0 4.5 5.0
BFI_Neuroticism 9 0 2.6 1.2 1.0 2.5 5.0
BFI_Openness 9 0 3.8 0.9 1.0 4.0 5.0

GCBS

Code
GCBS <- select(df, starts_with("GCBS"))

df$GCBS_GovernmentMalfeasance <- (df$GCBS15_1 + df$GCBS15_6 + df$GCBS15_11) / 3
df$GCBS_Extraterrestrial <- (df$GCBS15_3 + df$GCBS15_8 + df$GCBS15_13) / 3
df$GCBS_Malevolent <- (df$GCBS15_2 + df$GCBS15_7 + df$GCBS15_12) / 3
df$GCBS_Wellbeing <- (df$GCBS15_4 + df$GCBS15_9 + df$GCBS15_14) / 3
df$GCBS_Control <- (df$GCBS15_5 + df$GCBS15_10 + df$GCBS15_15) / 3
df$GCBS_General <- rowMeans(select(df, starts_with("GCBS_"), -GCBS_Duration), na.rm = TRUE)

plot(summary(correlation(select(df, starts_with("GCBS_"), -GCBS_Duration))))

Code
modelsummary::datasummary_skim(GCBS)
tinytable_mmvsxpplal3z657yerq7
Unique Missing Pct. Mean SD Min Median Max Histogram
GCBS_Duration 371 0 2.8 3.1 0.3 2.0 37.8
GCBS15_8 5 0 1.7 1.3 0.0 1.0 4.0
GCBS15_2 5 0 1.9 1.3 0.0 2.0 4.0
GCBS15_3 5 0 1.4 1.3 0.0 1.0 4.0
GCBS15_6 5 0 1.7 1.3 0.0 1.0 4.0
GCBS15_11 5 0 2.0 1.3 0.0 2.0 4.0
GCBS15_1 5 0 1.9 1.3 0.0 2.0 4.0
GCBS15_5 5 0 1.8 1.3 0.0 1.0 4.0
GCBS15_13 5 0 1.5 1.3 0.0 1.0 4.0
GCBS15_15 5 0 2.8 1.2 0.0 3.0 4.0
GCBS15_12 5 0 1.9 1.3 0.0 2.0 4.0
GCBS15_9 5 0 1.5 1.3 0.0 1.0 4.0
GCBS15_7 5 0 1.8 1.4 0.0 2.0 4.0
GCBS15_14 5 0 1.8 1.3 0.0 2.0 4.0
GCBS15_4 5 0 1.8 1.4 0.0 2.0 4.0
GCBS15_10 5 0 2.2 1.2 0.0 3.0 4.0

MOCRI

Code
## does this give us useful data for both the pre- and post-intervention MOCRI, or do I need to split my coding to be between the bold and nonbold (as this is how the MOCRI is split pre- and post-)?
MOCRI <- select(df, starts_with("MOCRI_"))
df <- select(df, -starts_with("MOCRI"))

# Recode in terms of correct incorrect
# df$MOCRI_Pretest_Manipulative1 <- ifelse(df$MOCRI_BOLD_MOCRI_BOLD_8_Manipulative == 1, 1, 0)

modelsummary::datasummary_skim(MOCRI)
tinytable_hgvje1dobnbyy8bvknsq
Unique Missing Pct. Mean SD Min Median Max Histogram
MOCRI_BOLD_Duration 371 0 2.7 2.2 0.3 2.2 22.2
MOCRI_BOLD_MOCRI_BOLD_4_NonManipulative 2 0 0.8 0.4 0.0 1.0 1.0
MOCRI_BOLD_MOCRI_BOLD_10_Manipulative 2 0 0.3 0.5 0.0 0.0 1.0
MOCRI_BOLD_MOCRI_BOLD_9_NonManipulative 2 0 0.7 0.5 0.0 1.0 1.0
MOCRI_BOLD_MOCRI_BOLD_8_Manipulative 2 0 0.3 0.5 0.0 0.0 1.0
MOCRI_BOLD_MOCRI_BOLD_5_NonManipulative 2 0 0.7 0.5 0.0 1.0 1.0
MOCRI_BOLD_MOCRI_BOLD_11_NonManipulative 2 0 0.6 0.5 0.0 1.0 1.0
MOCRI_BOLD_MOCRI_BOLD_3_Manipulative 2 0 0.2 0.4 0.0 0.0 1.0
MOCRI_BOLD_MOCRI_BOLD_6_Manipulative 2 0 0.3 0.4 0.0 0.0 1.0
MOCRI_BOLD_MOCRI_BOLD_12_Manipulative 2 0 0.5 0.5 0.0 0.0 1.0
MOCRI_BOLD_MOCRI_BOLD_2_NonManipulative 2 0 0.7 0.4 0.0 1.0 1.0
MOCRI_BOLD_MOCRI_BOLD_7_NonManipulative 2 0 0.7 0.4 0.0 1.0 1.0
MOCRI_BOLD_MOCRI_BOLD_1_Manipulative 2 0 0.2 0.4 0.0 0.0 1.0
MOCRI_NONBOLD_Duration 369 0 2.0 1.8 0.3 1.6 17.1
MOCRI_NONBOLD_MOCRI_10_Manipulative 2 0 0.2 0.4 0.0 0.0 1.0
MOCRI_NONBOLD_MOCRI_6_Manipulative 2 0 0.3 0.4 0.0 0.0 1.0
MOCRI_NONBOLD_MOCRI_2_Manipulative 2 0 0.2 0.4 0.0 0.0 1.0
MOCRI_NONBOLD_MOCRI_7_Manipulative 2 0 0.2 0.4 0.0 0.0 1.0
MOCRI_NONBOLD_MOCRI_9_NonManipulative 2 0 0.8 0.4 0.0 1.0 1.0
MOCRI_NONBOLD_MOCRI_1_NonManipulative 2 0 0.9 0.4 0.0 1.0 1.0
MOCRI_NONBOLD_MOCRI_3_Manipulative 2 0 0.2 0.4 0.0 0.0 1.0
MOCRI_NONBOLD_MOCRI_5_NonManipulative 2 0 0.7 0.4 0.0 1.0 1.0
MOCRI_NONBOLD_MOCRI_11_NonManipulative 2 0 0.6 0.5 0.0 1.0 1.0
MOCRI_NONBOLD_MOCRI_12_NonManipulative 2 0 0.6 0.5 0.0 1.0 1.0
MOCRI_NONBOLD_MOCRI_4_NonManipulative 2 0 0.8 0.4 0.0 1.0 1.0
MOCRI_NONBOLD_MOCRI_8_Manipulative 2 0 0.2 0.4 0.0 0.0 1.0

MIST

Code
dfmist <- dfmist |>
  full_join(df[, c("Participant", "Intervention")], by = "Participant") |>
  mutate(temp = Item) |> 
  separate(temp, into = c("extra", "QuestionType", "Topic", "QuestionID"), sep = "_")  |> 
  select(-extra, -QuestionID) |> 
    mutate(
    Correct = case_when(
      MIST > 50 & QuestionType == "real" ~ 1,
      MIST < 50 & QuestionType == "fake" ~ 1,
      .default = 0
    ),
    True_Positive = ifelse(MIST > 50 & QuestionType == "real", 1, 0),
    True_Negative = ifelse(MIST < 50 & QuestionType == "fake", 1, 0),
    False_Positive = ifelse(MIST > 50 & QuestionType == "fake", 1, 0),
    False_Negative = ifelse(MIST < 50 & QuestionType == "real", 1, 0)
    )

Items

Code
# Participant scores
dfmist |> 
  summarise(p_Correct = sum(Correct) / n(), .by=c("Item")) |> 
  arrange(p_Correct) |> 
  gt::gt() |> 
  gt::fmt_number(columns = "p_Correct", decimals = 2) |> 
  gt::data_color(columns = "p_Correct", palette=c("red", "green"))
Item p_Correct
MIST_real_general_14 0.09
MIST_real_general_12 0.14
MIST_fake_general_10 0.15
MIST_real_general_11 0.18
MIST_fake_covid_1 0.22
MIST_real_general_10 0.22
MIST_fake_covid_6 0.26
MIST_fake_covid_5 0.27
MIST_fake_general_12 0.30
MIST_fake_covid_7 0.30
MIST_real_general_7 0.31
MIST_fake_covid_9 0.32
MIST_fake_covid_10 0.32
MIST_real_general_13 0.33
MIST_fake_covid_3 0.34
MIST_real_general_8 0.40
MIST_real_general_4 0.41
MIST_fake_covid_8 0.43
MIST_fake_general_2 0.46
MIST_fake_covid_2 0.47
MIST_real_general_9 0.49
MIST_real_general_2 0.54
MIST_fake_general_8 0.57
MIST_fake_general_26 0.58
MIST_real_covid_7 0.58
MIST_real_covid_1 0.61
MIST_fake_general_18 0.61
MIST_fake_general_30 0.62
MIST_fake_general_19 0.63
MIST_real_general_5 0.63
MIST_real_general_31 0.63
MIST_fake_general_13 0.64
MIST_fake_general_21 0.64
MIST_fake_general_29 0.64
MIST_real_covid_4 0.65
MIST_fake_general_22 0.65
MIST_fake_general_15 0.68
MIST_fake_general_25 0.68
MIST_fake_general_24 0.68
MIST_real_general_18 0.68
MIST_real_general_25 0.69
MIST_real_general_30 0.70
MIST_fake_general_14 0.70
MIST_fake_general_20 0.70
MIST_real_general_3 0.70
MIST_real_general_20 0.70
MIST_real_general_19 0.70
MIST_fake_general_6 0.71
MIST_real_general_26 0.72
MIST_fake_general_27 0.72
MIST_real_general_22 0.72
MIST_fake_general_28 0.73
MIST_real_general_17 0.73
MIST_real_general_1 0.73
MIST_real_general_16 0.73
MIST_fake_general_4 0.74
MIST_fake_general_3 0.75
MIST_fake_general_11 0.75
MIST_fake_general_5 0.75
MIST_real_covid_5 0.76
MIST_real_covid_6 0.77
MIST_real_covid_3 0.78
MIST_fake_general_1 0.78
MIST_real_general_15 0.78
MIST_fake_general_17 0.79
MIST_fake_general_9 0.79
MIST_real_general_28 0.79
MIST_fake_general_7 0.80
MIST_real_general_24 0.80
MIST_real_general_27 0.81
MIST_real_covid_10 0.81
MIST_fake_general_16 0.81
MIST_real_covid_9 0.81
MIST_real_general_23 0.83
MIST_fake_covid_4 0.85
MIST_fake_general_23 0.86
MIST_real_general_29 0.86
MIST_real_covid_8 0.87
MIST_real_general_21 0.87
MIST_real_general_6 0.87
MIST_real_covid_2 0.91

Scores

Code
compute_dprime <- function(data) {
  # Calculate hit rate and false alarm rate
  H <- (data$True_Positive + 0.5) / (data$True_Positive + data$False_Negative + 1)  # Adjusted Hit Rate
  FA <- (data$False_Positive + 0.5) / (data$False_Positive + data$True_Negative + 1) # Adjusted False Alarm Rate
  
  # Parametric ----
  # Compute z-scores
  zH <- qnorm(H) # z-score for hit rate
  zFA <- qnorm(FA) # z-score for false alarm rate
  
  # d' and criterion
  d_prime <- zH - zFA
  criterion <- -0.5 * (zH + zFA)
  
  # Non-parametric ----
  # A' (A-prime)
  A_prime <- ifelse(
    H > FA,
    0.5 + ((H - FA) * (1 + H - FA)) / (4 * H * (1 - FA)),
    ifelse(
      H < FA,
      0.5 + ((FA - H) * (1 + FA - H)) / (4 * FA * (1 - H)),
      0.5
    )
  )
  
  # B''d
  B_double_prime <- ifelse(
    H != FA,
    ((1 - H) * (1 - FA) - H * FA) / ((1 - H) * (1 - FA) + H * FA),
    0
  )
  
  # Combine results
  cbind(data, data.frame(
    "dprime" = d_prime,
    "criterion" = criterion,
    "aprime" = A_prime,
    "bppd" = B_double_prime
  ))
}


# Participant scores
df <- dfmist |> 
  summarise(
    correct = sum(Correct) / n(),
    True_Positive = sum(True_Positive),
    True_Negative = sum(True_Negative),
    False_Positive = sum(False_Positive),
    False_Negative = sum(False_Negative),
    .by = c("Participant", "Condition", "Topic")
  ) |> 
  compute_dprime() |>  
  select(Participant, Condition, Topic, correct, dprime, criterion, aprime, bppd) |>
  pivot_wider(names_from=c("Condition", "Topic"), 
              values_from=c("correct", "dprime", "criterion", "aprime", "bppd"),
              names_vary = "slowest") |> 
  mutate(correct_Diff_covid = correct_Posttest_covid - correct_Pretest_covid,
         correct_Diff_general = correct_Posttest_general - correct_Pretest_general,
         dprime_Diff_covid =  dprime_Posttest_covid - dprime_Pretest_covid,
         dprime_Diff_general = dprime_Posttest_general - dprime_Pretest_general,
         criterion_Diff_covid = criterion_Posttest_covid - criterion_Pretest_covid,
         criterion_Diff_general = criterion_Posttest_general - criterion_Pretest_general,
         aprime_Diff_covid = aprime_Posttest_covid - aprime_Pretest_covid,
         aprime_Diff_general = aprime_Posttest_general - aprime_Pretest_general,
         bppd_Diff_covid = bppd_Posttest_covid - bppd_Pretest_covid,
         bppd_Diff_general = bppd_Posttest_general - bppd_Pretest_general) |>
  datawizard::data_addprefix("MIST_", select=-Participant) |>  
  full_join(df, by="Participant")

Final Sample

Gender

Code
ggplot(df, aes(x = Gender)) +
    geom_bar(fill = "skyblue") +
    labs(title = "Distribution of Gender", x = "Gender", y = "Count")

Code
## RACE
# this won't currently work as it's pulling from the self-identification ethnicity column which has far too many distinct entries to be represented well visually. I am not sure how we could get the basic racial data from prolific but it's also not useful for any actual testing we want to do, so probably wouldn't worry about it
#ggplot(df, aes(x = Ethnicity)) +
 # geom_bar(fill = "lightgreen") +
 # labs(title = "Distribution of Race", x = "Race", y = "Count")

Education

Code
# Create the bar chart for Education
ggplot(df, aes(x = Education)) +
  geom_bar(fill = "lightblue") +
  labs(title = "Distribution of Education Level",
       x = "Education Level",
       y = "Count") +
  theme_minimal()  # Optional: Adds a minimal theme for better aesthetics

Save

Code
write.csv(df, "../data/data.csv")
write.csv(dfmist, "../data/data_mist.csv")